Problem Overview

We are using the spotify dataset https://rdrr.io/cran/bayesrules/man/spotify.html. This dataset include 350 songs and 23 variables. In this problem, we try to use the song artist and the danceability of the song to predict the popularity. In below, I would explore using normal regression and hierarchical Bayesian modelling.

Setup

library(tidyverse)
library(infer)
library(broom)
library(rstan)
library(rstantools)
library(cowplot)
library(akima)
library(infer)
library(bayesplot)
library(bayesrules)
rstan_options(auto_write = TRUE)  # To save some compiling

1. Data Wrangling

  1. We only need the column artist, danceability and popularity
spotify_training <-spotify %>%
  select(artist, danceability, popularity)
spotify_training
artist_catalogue <- tibble(
  artist = levels(spotify_training$artist),
  code = 1:44
)

2. Exploratory Data Analysis

scatter_plot <- ggplot(spotify_training, aes(x = danceability, y = popularity)) +
  geom_point() +  # Scatterplot
  labs(x = "Danceability", y = "Popularity") +  # Axes labels
  ggtitle("Scatterplot of Popularity vs Danceability") +  # Plot title
  theme_minimal()  # Minimal theme (adjust as needed)

scatter_plot 

popularity_boxplots <- spotify_training %>%
  ggplot(aes(reorder(artist, popularity), popularity)) +
  geom_boxplot(aes(fill = artist)) +
  scale_fill_viridis_d(option="plasma")+
  labs(y = "Popularity Score", x = "Artist") +
  ggtitle("Side-by-Side Boxplots of Popularity by Artist") +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.text = element_text(size = 14, angle = 90),
    axis.title = element_text(size = 14),
    legend.position = "none"
  )

popularity_boxplots

3. Modelling with a linear regression

regression_model <- lm(popularity ~ danceability + artist, data = spotify_training)
summary(regression_model)
## 
## Call:
## lm(formula = popularity ~ danceability + artist, data = spotify_training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.412  -4.060   1.012   7.493  26.100 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               61.98340    5.48109  11.309  < 2e-16 ***
## danceability               0.04229    0.06568   0.644 0.520124    
## artistAtlas Genius       -18.88721    7.71414  -2.448 0.014913 *  
## artistAu/Ra               -3.71583    7.06034  -0.526 0.599065    
## artistBeyoncé              5.17710    4.28947   1.207 0.228393    
## artistBlack Stone Cherry -14.63669    6.75654  -2.166 0.031062 *  
## artistBUNT.              -21.75197    8.78331  -2.477 0.013809 *  
## artistC-Kan              -14.56436    7.71530  -1.888 0.060012 .  
## artistCamila Cabello      12.18058    3.93876   3.092 0.002168 ** 
## artistCamilo              15.96125    5.67768   2.811 0.005255 ** 
## artistChris Goldarg      -48.68593    5.48680  -8.873  < 2e-16 ***
## artistDA Image           -28.37020    8.70957  -3.257 0.001251 ** 
## artistDavid Lee Roth     -12.52655    8.71810  -1.437 0.151787    
## artistElisa              -18.00354    8.80120  -2.046 0.041655 *  
## artistFrank Ocean          5.22332    3.92970   1.329 0.184778    
## artistFreestyle          -31.97921    8.79336  -3.637 0.000324 ***
## artistHinder              -3.25652    6.71453  -0.485 0.628028    
## artistHoneywagon         -33.28139    8.70587  -3.823 0.000160 ***
## artistHouse Of Pain      -17.43234    7.14828  -2.439 0.015311 *  
## artistJ. Cole             11.78451    5.16661   2.281 0.023244 *  
## artistJazzinuf           -18.96459    6.56148  -2.890 0.004125 ** 
## artistJean Juan          -27.90192    7.04550  -3.960 9.32e-05 ***
## artistKendrick Lamar      -7.92359    4.09152  -1.937 0.053719 .  
## artistKid Frost          -24.76775    8.75278  -2.830 0.004968 ** 
## artistLeĂłn Larregui        0.69119    8.70985   0.079 0.936801    
## artistLil Skies           14.46986    8.70437   1.662 0.097467 .  
## artistMia X              -51.83339    7.71668  -6.717 9.08e-11 ***
## artistMichael Kiwanuka   -10.90358    8.71454  -1.251 0.211823    
## artistMike WiLL Made-It   -9.43590    6.23418  -1.514 0.131169    
## artistMissy Elliott       -3.78958    7.09328  -0.534 0.593558    
## artistMTK                -20.70440    7.70955  -2.686 0.007637 ** 
## artistNODE               -11.65463    7.04987  -1.653 0.099325 .  
## artistPlacebo             -8.23114    6.72302  -1.224 0.221776    
## artistRöyksopp           -31.30792    7.72020  -4.055 6.36e-05 ***
## artistSean Kingston        7.63114   10.44754   0.730 0.465691    
## artistSoul&Roll          -41.00181    7.06426  -5.804 1.62e-08 ***
## artistSufjan Stevens       1.55881    8.77869   0.178 0.859181    
## artistTamar Braxton      -23.31291    8.80678  -2.647 0.008539 ** 
## artistThe Blaze           -2.23813    7.72715  -0.290 0.772284    
## artistThe Wrecks         -10.10017    8.72741  -1.157 0.248058    
## artistThe xx              -9.75650    5.92209  -1.647 0.100490    
## artistTV Noise           -27.19308    4.99412  -5.445 1.07e-07 ***
## artistVampire Weekend     -1.32989    6.60917  -0.201 0.840661    
## artistX Ambassadors       -6.72346    6.32833  -1.062 0.288877    
## artistZeds Dead          -13.79065    7.07695  -1.949 0.052251 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.01 on 305 degrees of freedom
## Multiple R-squared:  0.5973, Adjusted R-squared:  0.5393 
## F-statistic: 10.28 on 44 and 305 DF,  p-value: < 2.2e-16
# Scatterplot with regression lines by artist and smaller legend
scatter_plot <- ggplot(spotify_training, aes(x = danceability, y = popularity, color = artist)) +
  geom_point() +  # Scatterplot
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +  # Regression lines without confidence intervals
  labs(x = "Danceability", y = "Popularity", title = "Scatterplot of Popularity vs Danceability by Artist") +
  theme_minimal() +
  scale_color_discrete(name = "Artist") +  # Legend title
  theme(
    legend.text = element_text(size = 6)  # Adjust legend text size
  )

# Print the scatterplot with smaller legend
print(scatter_plot)

Here we see that for some artists there are less data points, and therefore our prediction is not doing very great, with a r-square pretty low. Also the prediction interval would be very huge because of our uncertainty.

4. Using Bayesian Model to model the data

\[ \begin{align*} \text{likelihood:} \qquad Y_{i,j} \mid \mu_j, \sigma_w, \beta, X_{i,j} \sim \mathcal{N}(\mu_j + \beta X_{i,j}, \sigma^2_w) \\ \text{where} \quad \mu_j = \mu + b_j \\ \text{priors:} \quad b_j \mid \sigma_\mu \sim \mathcal{N}(0, \sigma_\mu^2) \\ \qquad \mu \sim \mathcal{N}(50, 52^2) \\ \qquad \beta \sim \mathcal{N}(0, 1) \\ \qquad \sigma_w \sim \text{Exponential}(1) \\ \qquad \sigma_\mu \sim \text{Exponential}(0.048). \end{align*} \]

In this model, \(Y_{i,j}\) represents the popularity score of the \(i\)-th song by the \(j\)-th artist,

\(X_{i,j}\) represents the danceability score of the \(i\)-th song by the \(j\)-th artist.

The popularity score of the i-th song is normally distributed with a mean that is the sum of the overall mean popularity, the artist-specific deviation in popularity, and the product of the danceability score and a coefficient. The standard deviation of this distribution is the within-artist standard deviation for popularity (within_sigma_popularity).

The mean \(\mu_j\) for each artist is modeled as a global mean \(\mu\) plus an artist-specific deviation \(b_j\). The artist-specific deviations are assumed to be normally distributed with mean 0 and between-artist standard deviation \(\sigma_\mu\).

The priors for the global mean \(\mu\), the coefficient \(\beta\) of the danceability score, the within-artist standard deviation \(\sigma_w\), and the between-artist standard deviation \(\sigma_\mu\) are also specified. In the prior, it is estimated that the between artist variability would be higher than the within artist variability.

4.1 Visualizing Priors

prior_normal_50_52 <- ggplot() +
  xlim(-200, 300) +
  ylim(0, 0.01) +
  geom_function(fun = dnorm, args = list(mean = 50, sd = 52), linewidth = 1) +
  theme(
    plot.title = element_text(size = 16),
    axis.text.x = element_text(size = 12, angle = 0),
    axis.text.y = element_text(size = 12, angle = 0),
    axis.title = element_text(size = 12),
  ) +
  labs(y = "Density", x = expression(mu)) +
  ggtitle(expression(paste("Prior Normal(50, ", 52^2, ")")))

prior_exp_0.048 <- ggplot() +
  xlim(0, 140) +
  ylim(0, 0.05) +
  geom_function(fun = dexp, args = list(rate = 0.048), linewidth = 1) +
  theme(
    plot.title = element_text(size = 16),
    axis.text.x = element_text(size = 12, angle = 0),
    axis.text.y = element_text(size = 12, angle = 0),
    axis.title = element_text(size = 12),
  ) +
  labs(y = "Density", x = expression(sigma[mu])) +
  ggtitle(expression("Prior Exponential(0.048)"))

prior_exp_1 <- ggplot() +
  xlim(0, 5) +
  ylim(0, 1) +
  geom_function(fun = dexp, args = list(rate = 1), linewidth = 1) +
  theme(
    plot.title = element_text(size = 16),
    axis.text.x = element_text(size = 12, angle = 0),
    axis.text.y = element_text(size = 12, angle = 0),
    axis.title = element_text(size = 12),
  ) +
  labs(y = "Density", x = expression(sigma["w"])) +
  ggtitle(expression("Prior Exponential(1)"))
plot_grid(prior_normal_50_52, prior_exp_0.048, prior_exp_1)

4.2 Code the Model on Stan

// YOUR CODE HERE
data {                          
  int<lower=1> n;                     //rows in training set
  int<lower=1> num_artist;            //number of artists in training set
  int<lower=1> artist[n];             //artist ID column by song in training set
  vector[n] dance_score;              //danceability scores by song in training set
  vector[n] popularity_score;         //popularity scores by song in training set
}
parameters {
  vector[num_artist] mean_dance;      //vector of posterior danceability score deviations for each artist (size num_artist)
  vector[num_artist] mean_popularity; //vector of posterior popularity score deviations for each artist (size num_artist)
  real overall_mean_dance;            //overall mean in danceability score
  real overall_mean_popularity;       //overall mean in popularity score
  real<lower=0> between_sigma_dance;  //between-artist sd in danceability score
  real<lower=0> between_sigma_popularity; //between-artist sd in popularity score
  real<lower=0> within_sigma_dance;   //within-artist sd in danceability score
  real<lower=0> within_sigma_popularity;  //within-artist sd in popularity score
  real beta;                          //coefficient for danceability score
}
model {
  between_sigma_dance ~ exponential(0.048);     //between-artist sd prior for danceability
  between_sigma_popularity ~ exponential(0.048); //between-artist sd prior for popularity
  within_sigma_dance ~ exponential(1);  //within-artist sd prior for danceability
  within_sigma_popularity ~ exponential(1); //within-artist sd prior for popularity
  overall_mean_dance ~ normal(50, 52);      //overall mean prior for danceability
  overall_mean_popularity ~ normal(50, 52); //overall mean prior for popularity
  beta ~ normal(0, 1);                      //prior for the coefficient of danceability score
  for (j in 1:num_artist){            //danceability and popularity scores priors (deviations per artist)
    mean_dance[j] ~ normal(0, between_sigma_dance);
    mean_popularity[j] ~ normal(0, between_sigma_popularity);
  }
  for (i in 1:n){
    int artist_index = artist[i];     //auxiliary indexing variable
    dance_score[i] ~ normal(overall_mean_dance + mean_dance[artist_index], within_sigma_dance); //likelihood in training set for danceability
    popularity_score[i] ~ normal(overall_mean_popularity + mean_popularity[artist_index] + beta * dance_score[i], within_sigma_popularity); //likelihood in training set for popularity
  }
}

4.3 Sample and Visualize the Posterior

levels(spotify_training$artist) <- 1:44

spotify_dictionary <- list(
  n = nrow(spotify_training),
  num_artist = nrow(artist_catalogue),
  artist = as.integer(spotify_training$artist),
  dance_score = spotify_training$danceability,
  popularity_score = spotify_training$popularity
)

# YOUR CODE HERE
posterior_spotify <- sampling(
  object = spotify_stan_model,
  data = spotify_dictionary,
  chains = 4,
  iter = 25000,
  warmup = 5000,
  thin = 20,
  seed = 553,
  cores = getOption("mc.cores", 4)
)

posterior_spotify_sampling <- as.data.frame(posterior_spotify)
summary_overall <- as.data.frame(summary(posterior_spotify)$summary)
summary_overall <- summary_overall[89:94, c("mean", "sd", "2.5%", "97.5%")] %>%
  mutate_if(is.numeric, round, 3)
summary_overall

Observation

  • The overall danceability is between 61.956 and 68.849 with 95% probability and a posterior mean of 65.316.

  • The overall popularity is less than danceability between 41.908 and 59.915 with 95% probability and a posterior mean of 51.107. The credible interval is larger because it incorporates the danceability in estimation.

  • Moreover, we see more variability associated to the between-artist standard deviation which align with our assumption. On the other hand, the within-artist variability shows less variability.

summary_artist_means <- as.data.frame(summary(posterior_spotify)$summary)
summary_artist_means <- summary_artist_means[45:88, c("mean", "sd", "2.5%", "97.5%")] %>%
  mutate_if(is.numeric, round, 3)
summary_artist_means$artist <- as.factor(artist_catalogue$artist)
summary_artist_means %>%
  arrange(-mean) 
posterior_artist_means_CIs_plot <- summary_artist_means %>%
  arrange(-mean) %>%
  slice(1:10) %>%
  mutate(artist = fct_reorder(artist, mean)) %>%
  ggplot(aes(x = mean, y = artist)) +
  geom_errorbarh(aes(xmax = `2.5%`, xmin = `97.5%`, color = artist)) +
  geom_point(color = "black") +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.position = "none"
  ) +
  ggtitle("95% CIs for Artist Mean Deviations") +
  labs(x = "Artist Mean Deviation", y = "Artist") +
  geom_vline(xintercept = 0, linetype = "dashed")

See which artist has a higher popularity than the mean

posterior_artist_means_CIs_plot

From the above plot, we can see the artists that has a higher popularity compared to the overall mean popularity for the songs on the platform, since their credible intervals with positive bounds.

4.4 MCMC Diagnosis

color_scheme_set("mix-blue-pink")

trace_spotify_4_chains <- mcmc_trace(posterior_spotify,
  pars = c(
    "overall_mean_dance",
    "overall_mean_popularity",
    "between_sigma_dance",
    "between_sigma_popularity",
    "within_sigma_dance",
    "within_sigma_popularity"),
  size = 0.4,
  facet_args = list(nrow = 6)
) +
  ggtitle("Trace Plots by Parameter of Interest") +
  theme(
    plot.title = element_text(size = 16, face = "bold", family = "sans"),
    axis.text = element_text(size = 12, family = "sans"),
    axis.title = element_text(size = 12, family = "sans"),
    legend.text = element_text(size = 12, family = "sans"),
    legend.title = element_text(size = 12, family = "sans")
  ) +
  facet_text(size = 12)
trace_spotify_4_chains 

We are seeing a flat pattern by parameter without any upward or a downward trend throughout the chain. Moreover, there isn’t any chain stuck. Suggesting the sampling has gone well.

4.5 Prediction of all songs (Suppose we do not have any information)

data {                          
  int<lower=1> n;                     
  int<lower=1> num_artist;            
  int<lower=1> artist[n];             
  vector[n] dance_score;              
  vector[n] popularity_score;         
}
parameters {
  vector[num_artist] mean_dance;      
  vector[num_artist] mean_popularity; 
  real overall_mean_dance;            
  real overall_mean_popularity;       
  real<lower=0> between_sigma_dance;  
  real<lower=0> between_sigma_popularity;
  real<lower=0> within_sigma_dance;   
  real<lower=0> within_sigma_popularity;
  real beta;                          
}
model {
  between_sigma_dance ~ exponential(0.048);     
  between_sigma_popularity ~ exponential(0.048);
  within_sigma_dance ~ exponential(1);  
  within_sigma_popularity ~ exponential(1);
  overall_mean_dance ~ normal(50, 52);      
  overall_mean_popularity ~ normal(50, 52);
  beta ~ normal(0, 1);                      
  for (j in 1:num_artist){            
    mean_dance[j] ~ normal(0, between_sigma_dance);
    mean_popularity[j] ~ normal(0, between_sigma_popularity);
  }
  for (i in 1:n){
    int artist_index = artist[i];     
    dance_score[i] ~ normal(overall_mean_dance + mean_dance[artist_index], within_sigma_dance);
    popularity_score[i] ~ normal(overall_mean_popularity + mean_popularity[artist_index] + beta * dance_score[i], within_sigma_popularity);
  }
}
generated quantities {
  vector[n] popularity_score_pred;
  for (i in 1:n){
    int artist_index = artist[i];
    popularity_score_pred[i] = normal_rng(overall_mean_popularity + mean_popularity[artist_index] + beta * dance_score[i], within_sigma_popularity);
  }
}
posterior_spotify_pred <- sampling(
  object = spotify_pred_model,
  data = spotify_dictionary,
  chains = 1,
  iter = 25000,
  warmup = 5000,
  thin = 20,
  seed = 553
)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000149 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.49 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:     1 / 25000 [  0%]  (Warmup)
## Chain 1: Iteration:  2500 / 25000 [ 10%]  (Warmup)
## Chain 1: Iteration:  5000 / 25000 [ 20%]  (Warmup)
## Chain 1: Iteration:  5001 / 25000 [ 20%]  (Sampling)
## Chain 1: Iteration:  7500 / 25000 [ 30%]  (Sampling)
## Chain 1: Iteration: 10000 / 25000 [ 40%]  (Sampling)
## Chain 1: Iteration: 12500 / 25000 [ 50%]  (Sampling)
## Chain 1: Iteration: 15000 / 25000 [ 60%]  (Sampling)
## Chain 1: Iteration: 17500 / 25000 [ 70%]  (Sampling)
## Chain 1: Iteration: 20000 / 25000 [ 80%]  (Sampling)
## Chain 1: Iteration: 22500 / 25000 [ 90%]  (Sampling)
## Chain 1: Iteration: 25000 / 25000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 15.821 seconds (Warm-up)
## Chain 1:                44.773 seconds (Sampling)
## Chain 1:                60.594 seconds (Total)
## Chain 1:
summary_pred <- as.data.frame(summary(posterior_spotify_pred)$summary)
summary_pred  <- summary_pred [96:445,c("mean")]
# Calculate the total sum of squares
sst <- sum((spotify_training$popularity - mean(spotify_training$popularity))^2)

# Calculate the residual sum of squares
ssr <- sum((spotify_training$popularity - summary_pred)^2)

# Calculate the R-squared score
r2_score <- 1 - (ssr / sst)

# Print the R-squared score
print(paste("The R-squared score is", r2_score))
## [1] "The R-squared score is 0.589440319090849"